1 Data Cleaning

# Load packages ----
library(dplyr)
library(e1071)

# Read the raw WHO air-quality data
data <- read.csv("../who_data.csv")

# Drop index/bookkeeping columns that carry no information
who_data <- data %>%
  select(-c(X, X.1, X.2, X.3, who_ms))

# Literal "NA" strings become real missing values
who_data[who_data == "NA"] <- NA

# Measurement and population columns arrive as text; coerce them to numeric
num_cols <- c("pm10_concentration", "pm25_concentration", "no2_concentration",
              "pm10_tempcov", "pm25_tempcov", "no2_tempcov", "population")
who_data <- who_data %>%
  mutate(across(all_of(num_cols), as.numeric))

# Station type stays character; WHO region becomes a factor
who_data <- who_data %>%
  mutate(
    type_of_stations = as.character(type_of_stations),
    who_region = as.factor(who_region)
  )

# Keep only the European region and drop the PM2.5 concentration column
europe_data <- who_data %>%
  filter(who_region == "4_Eur") %>%
  select(-pm25_concentration)

# Discard rows with any remaining missing value (SVR cannot handle NAs)
clean_data_europe <- na.omit(europe_data)

summary(clean_data_europe)
##    who_region       iso3           country_name           city          
##  4_Eur  :4106   Length:4106        Length:4106        Length:4106       
##         :   0   Class :character   Class :character   Class :character  
##  1_Afr  :   0   Mode  :character   Mode  :character   Mode  :character  
##  2_Amr  :   0                                                           
##  3_Sear :   0                                                           
##  5_Emr  :   0                                                           
##  (Other):   0                                                           
##       year      pm10_concentration no2_concentration  pm10_tempcov   
##  Min.   :2013   Min.   : 6.045     Min.   :  1.052   Min.   :  0.00  
##  1st Qu.:2015   1st Qu.:17.479     1st Qu.: 17.651   1st Qu.: 92.00  
##  Median :2017   Median :20.991     Median : 23.401   Median : 96.00  
##  Mean   :2017   Mean   :22.648     Mean   : 24.314   Mean   : 91.17  
##  3rd Qu.:2019   3rd Qu.:25.958     3rd Qu.: 29.865   3rd Qu.: 99.00  
##  Max.   :2022   Max.   :88.000     Max.   :110.432   Max.   :100.00  
##                                                                      
##   pm25_tempcov     no2_tempcov     type_of_stations     population      
##  Min.   :  0.00   Min.   :  0.00   Length:4106        Min.   :     199  
##  1st Qu.: 89.00   1st Qu.: 93.00   Class :character   1st Qu.:   95430  
##  Median : 96.00   Median : 96.00   Mode  :character   Median :  173985  
##  Mean   : 88.24   Mean   : 93.57                      Mean   :  434184  
##  3rd Qu.: 98.00   3rd Qu.: 99.00                      3rd Qu.:  377134  
##  Max.   :100.00   Max.   :100.00                      Max.   :15190336  
##                                                                         
##     latitude        longitude      
##  Min.   :-20.89   Min.   :-61.536  
##  1st Qu.: 43.53   1st Qu.:  1.724  
##  Median : 48.52   Median :  9.187  
##  Mean   : 47.36   Mean   :  8.979  
##  3rd Qu.: 51.50   3rd Qu.: 15.642  
##  Max.   : 69.67   Max.   : 55.465  
## 
# Encode the remaining categorical variables as factors
clean_data_europe <- clean_data_europe %>%
  mutate(across(c(country_name, type_of_stations, city), as.factor))

# Temporal split: training (years up to 2019, i.e. 2013-2019 in this data),
# validation (2020) and testing (2021).
# NOTE(review): rows from 2022 fall into none of the three splits and are
# silently unused — confirm this is intended.
train_data <- clean_data_europe %>% filter(year <= 2019)
valid_data <- clean_data_europe %>% filter(year == 2020)
test_data  <- clean_data_europe %>% filter(year == 2021)

2 Support Vector Machine (SVM)

Support Vector Machines are supervised learning models used for both classification and regression problems. They work by finding an optimal boundary, known as a hyperplane, that best separates or predicts data points. The observations that lie closest to this boundary are called support vectors, and they are the most influential in defining the model.

There are two main types of Support Vector Machines:
- Support Vector Classification (SVC), which is used when the target variable is categorical
- Support Vector Regression (SVR), which is used when the target variable is continuous

2.1 Model used in this project

Since we aim to predict PM₁₀ pollutant concentrations, which are continuous values, we use the Support Vector Regression (SVR) model.
This model finds a smooth function that predicts PM₁₀ levels but SVR cannot handle missing values directly, so rows containing NA’s must be removed or imputed before training the model.

Support Vector Regression (SVR) could be useful for environmental data because:
- It can deal with non-linear relationships.
- It is quite robust to noise and outliers.
- It can use several explanatory variables at once to improve predictions.

Even though SVR has these advantages, we will also try other regression and machine learning models to see which one predicts PM₁₀ levels across European countries best.

We will try different kernels in SVR to see which works best for predicting PM₁₀ levels.

  1. Linear Kernel
    • Simple and fast.
    • Good if the relationship between features and PM₁₀ is roughly linear.
  2. Polynomial Kernel
    • Can capture curved relationships in the data.
    • Start with low degrees (e.g., 2 or 3) to avoid overfitting.
  3. RBF (Radial Basis Function) Kernel
    • Can model complex, non-linear patterns.
    • Default choice if the relationship is unknown.
    • Gamma parameter may need tuning for best performance.

We will start with simple kernels and increase complexity to find the best model for PM₁₀ predictions.

2.2 Library requirements

In order to run our code, the following libraries are required:

# Install packages if needed
if(!require("ggplot2")) install.packages("ggplot2")
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
## 
##     element
if(!require("dplyr")) install.packages("dplyr")
if(!require("e1071")) install.packages("e1071")
if(!require("ggrepel")) install.packages("ggrepel")
## Loading required package: ggrepel
if(!require("kableExtra")) install.packages("kableExtra")
## Loading required package: kableExtra
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
if(!require("plotly")) install.packages("plotly")
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
if(!require("caret")) install.packages("caret")
## Loading required package: caret
## Loading required package: lattice
if(!require("stringi")) install.packages("stringi")
## Loading required package: stringi
# We load the libraries we will need throughout the project:
library(ggplot2)
library(dplyr)
library(e1071)
library(ggrepel)
library(kableExtra)
library(plotly)
library(caret)
library(stringi)

3 Linear Kernel SVR

# Train a linear-kernel SVR on the training years
svr_linear_model <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  kernel = "linear",
  type = "eps-regression"
)

# Predicted PM10 for the 2020 validation set
predictions_linear_valid <- predict(svr_linear_model, newdata = valid_data)

# Predicted vs actual: points on the dashed red line are perfect predictions
ggplot(valid_data, aes(x = pm10_concentration, y = predictions_linear_valid)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "Linear SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

# City/country names may contain non-UTF-8 bytes; keep sanitised copies
# used only for the hover tooltips
tooltip_city <- iconv(valid_data$city, from = "", to = "UTF-8", sub = "?")
tooltip_country <- iconv(valid_data$country_name, from = "", to = "UTF-8", sub = "?")

# Interactive version of the validation plot

p_valid <- ggplot(valid_data, aes(
  x = pm10_concentration,
  y = predictions_linear_valid,
  text = I(paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country))
)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "Linear SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

ggplotly(p_valid, tooltip = "text")

3.1 Model Evaluation Metrics

  • RMSE (Root Mean Squared Error): Measures typical prediction error in µg/m³, penalising large errors. Lower values are better.
  • MAE (Mean Absolute Error): Average absolute difference between predicted and actual PM₁₀. Less sensitive to outliers than RMSE. Lower is better.
  • R² (R-squared): Proportion of variance in PM₁₀ explained by the model. Ranges from 0 to 1, higher is better.
# Ground-truth PM10 values for the 2020 validation set
actuals_linear_valid <- valid_data$pm10_concentration

# RMSE: typical prediction error in µg/m³; squaring penalises large misses
rmse_linear_valid <- sqrt(mean((predictions_linear_valid - actuals_linear_valid)^2))

# MAE: average absolute error; less sensitive to outliers than RMSE
mae_linear_valid <- mean(abs(predictions_linear_valid - actuals_linear_valid))

# R²: 1 - SSE/SST, the share of validation-set variance explained by the model
r_squared_linear_valid <- 1 - sum((predictions_linear_valid - actuals_linear_valid)^2) / 
                         sum((actuals_linear_valid - mean(actuals_linear_valid))^2)

# Report the three validation metrics for the linear kernel
cat("RMSE (Linear SVR - Validation):", rmse_linear_valid, "\n")
## RMSE (Linear SVR - Validation): 5.399075
cat("MAE (Linear SVR - Validation):", mae_linear_valid, "\n")
## MAE (Linear SVR - Validation): 3.818252
cat("R² (Linear SVR - Validation):", r_squared_linear_valid, "\n")
## R² (Linear SVR - Validation): 0.4219859

4 Polynomial Kernel SVR

# Degree-2 polynomial SVR fitted on the training years
svr_poly_model <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  type = "eps-regression",
  kernel = "polynomial",
  degree = 2,   # start low; higher degrees risk overfitting
  coef0 = 1     # constant term in the polynomial kernel
)

# Predictions for the 2020 validation set
predictions_poly_valid <- predict(svr_poly_model, newdata = valid_data)

# Static predicted-vs-actual plot; the dashed line marks perfect predictions
ggplot(valid_data, aes(x = pm10_concentration, y = predictions_poly_valid)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "Polynomial SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

# Interactive version with city/country hover tooltips
p_poly_valid <- ggplot(valid_data, aes(
  x = pm10_concentration,
  y = predictions_poly_valid,
  text = I(paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country))
)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "Polynomial SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

ggplotly(p_poly_valid, tooltip = "text")

4.1 Model Evaluation

# Ground-truth PM10 values for the 2020 validation set
actuals_poly_valid <- valid_data$pm10_concentration

# RMSE: typical prediction error in µg/m³; squaring penalises large misses
rmse_poly_valid <- sqrt(mean((predictions_poly_valid - actuals_poly_valid)^2))

# MAE: average absolute error; less sensitive to outliers than RMSE
mae_poly_valid <- mean(abs(predictions_poly_valid - actuals_poly_valid))

# R²: 1 - SSE/SST, the share of validation-set variance explained by the model
r2_poly_valid <- 1 - sum((predictions_poly_valid - actuals_poly_valid)^2) / 
                 sum((actuals_poly_valid - mean(actuals_poly_valid))^2)

# Report the three validation metrics for the polynomial kernel
cat("RMSE (Polynomial - Validation):", rmse_poly_valid, "\n")
## RMSE (Polynomial - Validation): 4.709768
cat("MAE (Polynomial - Validation):", mae_poly_valid, "\n")
## MAE (Polynomial - Validation): 3.581697
cat("R² (Polynomial - Validation):", r2_poly_valid, "\n")
## R² (Polynomial - Validation): 0.5601561

Our polynomial model already performs better than the linear one.

4.2 Hyperparameter tuning

library(e1071)
library(dplyr)

# Candidate polynomial hyperparameters: kernel degree and constant term
grid <- expand.grid(
  degree = c(2, 3, 4),
  coef0  = c(0, 1, 2)
)

# Fit one polynomial SVR per grid row on the training data, score it on the
# 2020 validation set, and return its metrics as a one-row data frame.
# Collecting the rows in a list and binding once avoids the O(n^2) cost of
# growing a data frame with rbind() inside the loop.
row_metrics <- lapply(seq_len(nrow(grid)), function(i) {
  d <- grid$degree[i]
  c0 <- grid$coef0[i]

  # Fit polynomial SVR on training data
  model <- svm(
    pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
    data = train_data,
    type = "eps-regression",
    kernel = "polynomial",
    degree = d,
    coef0 = c0
  )

  # Predict on validation data
  pred_valid <- predict(model, newdata = valid_data)
  actuals <- valid_data$pm10_concentration

  # Validation metrics for this configuration
  data.frame(
    degree = d,
    coef0  = c0,
    RMSE   = sqrt(mean((pred_valid - actuals)^2)),
    MAE    = mean(abs(pred_valid - actuals)),
    R2     = 1 - sum((pred_valid - actuals)^2) / sum((actuals - mean(actuals))^2)
  )
})

# Combine and rank configurations from best (lowest RMSE) to worst
results <- bind_rows(row_metrics) %>% arrange(RMSE)

# Show the table
print(results)
##   degree coef0     RMSE      MAE         R2
## 1      3     2 4.678480 3.421604  0.5659805
## 2      3     1 4.678512 3.420927  0.5659746
## 3      2     1 4.709768 3.581697  0.5601561
## 4      2     2 4.709824 3.581366  0.5601456
## 5      4     1 5.418156 3.638885  0.4178932
## 6      4     2 5.447603 3.637149  0.4115488
## 7      4     0 6.006909 4.354437  0.2845130
## 8      2     0 6.330398 5.145282  0.2053761
## 9      3     0 7.990291 5.482813 -0.2659739

4.3 Final polynomial model

# Refit the polynomial SVR on the training years with the hyperparameters
# that minimised validation RMSE (degree 3, coef0 = 2)
svr_poly_model_best <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  type = "eps-regression",
  kernel = "polynomial",
  degree = 3,   # best from validation
  coef0 = 2     # best from validation
)

# Predictions for the 2020 validation set
predictions_poly_valid_best <- predict(svr_poly_model_best, newdata = valid_data)

# Static predicted-vs-actual plot; the dashed line marks perfect predictions
ggplot(valid_data, aes(x = pm10_concentration, y = predictions_poly_valid_best)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "Polynomial SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

# Interactive version with city/country hover tooltips
p_poly_valid <- ggplot(valid_data, aes(
  x = pm10_concentration,
  y = predictions_poly_valid_best,
  text = paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country)
)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "Polynomial SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

ggplotly(p_poly_valid, tooltip = "text")

4.4 Model Evaluation

# Ground-truth PM10 values for the 2020 validation set
actuals_poly_valid_best <- valid_data$pm10_concentration

# RMSE: typical prediction error in µg/m³; squaring penalises large misses
rmse_poly_valid_best <- sqrt(mean((predictions_poly_valid_best - actuals_poly_valid_best)^2))

# MAE: average absolute error; less sensitive to outliers than RMSE
mae_poly_valid_best <- mean(abs(predictions_poly_valid_best - actuals_poly_valid_best))

# R²: 1 - SSE/SST, the share of validation-set variance explained by the model
r2_poly_valid_best <- 1 - sum((predictions_poly_valid_best - actuals_poly_valid_best)^2) / 
                      sum((actuals_poly_valid_best - mean(actuals_poly_valid_best))^2)

# Report the validation metrics for the tuned polynomial kernel
cat("RMSE (Polynomial - Validation - Best):", rmse_poly_valid_best, "\n")
## RMSE (Polynomial - Validation - Best): 4.67848
cat("MAE (Polynomial - Validation - Best):", mae_poly_valid_best, "\n")
## MAE (Polynomial - Validation - Best): 3.421604
cat("R² (Polynomial - Validation - Best):", r2_poly_valid_best, "\n")
## R² (Polynomial - Validation - Best): 0.5659805

5 RBF (Radial Basis Function) Kernel SVR

# Libraries (already attached in section 2.2):
#library(e1071)
#library(ggplot2)
#library(plotly)
#library(stringi)

# Sanitised UTF-8 copies of city/country names, used only for hover tooltips
tooltip_city <- iconv(valid_data$city, from = "", to = "UTF-8", sub = "?")
tooltip_country <- iconv(valid_data$country_name, from = "", to = "UTF-8", sub = "?")

# RBF-kernel SVR fitted on the training years (2013-2019)
svr_rbf_model <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  type = "eps-regression",
  kernel = "radial",
  gamma = 0.01,  # starting value; tuned below
  cost = 1       # starting value; tuned below
)

# Predictions for the 2020 validation set
predictions_rbf_valid <- predict(svr_rbf_model, newdata = valid_data)

# Static predicted-vs-actual plot; the dashed line marks perfect predictions
ggplot(valid_data, aes(x = pm10_concentration, y = predictions_rbf_valid)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "RBF SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

# Interactive version with city/country hover tooltips
p_rbf_valid <- ggplot(valid_data, aes(
  x = pm10_concentration,
  y = predictions_rbf_valid,
  text = paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country)
)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "RBF SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

ggplotly(p_rbf_valid, tooltip = "text")

5.1 Model Evaluation

# Validation metrics for the initial RBF model on the 2020 set
actuals_rbf_valid <- valid_data$pm10_concentration
# RMSE: typical error in µg/m³; squaring penalises large misses
rmse_rbf_valid <- sqrt(mean((predictions_rbf_valid - actuals_rbf_valid)^2))
# MAE: average absolute error; less outlier-sensitive than RMSE
mae_rbf_valid <- mean(abs(predictions_rbf_valid - actuals_rbf_valid))
# R²: 1 - SSE/SST, share of validation variance explained
r2_rbf_valid <- 1 - sum((predictions_rbf_valid - actuals_rbf_valid)^2) / 
                sum((actuals_rbf_valid - mean(actuals_rbf_valid))^2)

cat("RMSE (RBF - Validation):", rmse_rbf_valid, "\n")
## RMSE (RBF - Validation): 4.529935
cat("MAE (RBF - Validation):", mae_rbf_valid, "\n")
## MAE (RBF - Validation): 3.497689
cat("R2  (RBF - Validation):", r2_rbf_valid, "\n")
## R2  (RBF - Validation): 0.5931039

This is already better than the polynomial kernel.

Hyperparameter Tuning for the RBF Kernel

# Candidate RBF hyperparameters: kernel width (gamma) and cost penalty
grid_rbf <- expand.grid(
  gamma = c(0.01, 0.1, 0.5),
  cost  = c(1, 5, 10)
)

# Fit one RBF SVR per (gamma, cost) pair on the training data, score it on
# the 2020 validation set, and return its metrics as a one-row data frame.
# Collecting the rows in a list and binding once avoids the O(n^2) cost of
# growing a data frame with rbind() inside the loop.
rbf_rows <- lapply(seq_len(nrow(grid_rbf)), function(i) {
  g <- grid_rbf$gamma[i]
  k <- grid_rbf$cost[i]

  # Fit RBF SVR on training data
  model <- svm(
    pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
    data = train_data,
    type = "eps-regression",
    kernel = "radial",
    gamma = g,
    cost  = k
  )

  # Predict on validation data
  pred_valid <- predict(model, newdata = valid_data)
  actuals <- valid_data$pm10_concentration

  # Validation metrics for this configuration
  data.frame(
    gamma = g,
    cost  = k,
    RMSE  = sqrt(mean((pred_valid - actuals)^2)),
    MAE   = mean(abs(pred_valid - actuals)),
    R2    = 1 - sum((pred_valid - actuals)^2) / sum((actuals - mean(actuals))^2)
  )
})

# Combine and rank configurations from best (lowest RMSE) to worst
results_rbf <- bind_rows(rbf_rows) %>% arrange(RMSE)

# Show the table
print(results_rbf)
##   gamma cost     RMSE      MAE        R2
## 1  0.10    1 4.375005 3.267978 0.6204607
## 2  0.10    5 4.396605 3.126451 0.6167037
## 3  0.01   10 4.425666 3.400929 0.6116199
## 4  0.01    5 4.441561 3.418860 0.6088253
## 5  0.10   10 4.506137 3.130237 0.5973679
## 6  0.01    1 4.529935 3.497689 0.5931039
## 7  0.50    1 4.734048 3.441449 0.5556092
## 8  0.50    5 4.859162 3.578828 0.5318098
## 9  0.50   10 4.898298 3.636437 0.5242377
# Configuration with the lowest validation RMSE
best_rmse <- results_rbf %>% slice_min(RMSE, n = 1)
cat("Best RMSE:\n")
## Best RMSE:
print(best_rmse)
##   gamma cost     RMSE      MAE        R2
## 1   0.1    1 4.375005 3.267978 0.6204607
# Configuration with the lowest validation MAE
best_mae <- results_rbf %>% slice_min(MAE, n = 1)
cat("\nBest MAE:\n")
## 
## Best MAE:
print(best_mae)
##   gamma cost     RMSE      MAE        R2
## 1   0.1    5 4.396605 3.126451 0.6167037
# Configuration with the highest validation R²
best_r2 <- results_rbf %>% slice_max(R2, n = 1)
cat("\nBest R²:\n")
## 
## Best R²:
print(best_r2)
##   gamma cost     RMSE      MAE        R2
## 1   0.1    1 4.375005 3.267978 0.6204607

5.2 Final RBF Kernel Model

# Refit the RBF SVR on the training years with the hyperparameters that
# minimised validation RMSE (gamma = 0.1, cost = 1)
svr_rbf_model_best <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  type = "eps-regression",
  kernel = "radial",
  gamma = 0.1,  # best gamma from validation
  cost = 1      # best cost from validation
)

# Predictions for the 2020 validation set using the tuned model
predictions_rbf_valid_best <- predict(svr_rbf_model_best, newdata = valid_data)

# Static predicted-vs-actual plot; the dashed line marks perfect predictions
ggplot(valid_data, aes(x = pm10_concentration, y = predictions_rbf_valid_best)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "RBF SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

# Interactive version with city/country hover tooltips
p_rbf_valid_best <- ggplot(valid_data, aes(
  x = pm10_concentration,
  y = predictions_rbf_valid_best,
  text = paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country)
)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "RBF SVR Predictions vs Actual (Validation 2020)",
    x = "Actual PM10",
    y = "Predicted PM10"
  ) +
  theme_minimal()

ggplotly(p_rbf_valid_best, tooltip = "text")

5.3 Model Evaluation

# Ground-truth PM10 values for the 2020 validation set
actuals_rbf_valid_best <- valid_data$pm10_concentration

# RMSE: typical prediction error in µg/m³; squaring penalises large misses
rmse_rbf_valid_best <- sqrt(mean((predictions_rbf_valid_best - actuals_rbf_valid_best)^2))

# MAE: average absolute error; less sensitive to outliers than RMSE
mae_rbf_valid_best <- mean(abs(predictions_rbf_valid_best - actuals_rbf_valid_best))

# R²: 1 - SSE/SST, the share of validation-set variance explained by the model
r2_rbf_valid_best <- 1 - sum((predictions_rbf_valid_best - actuals_rbf_valid_best)^2) / 
                     sum((actuals_rbf_valid_best - mean(actuals_rbf_valid_best))^2)

# Report the validation metrics for the tuned RBF kernel
cat("RMSE (RBF - Validation - Best):", rmse_rbf_valid_best, "\n")
## RMSE (RBF - Validation - Best): 4.375005
cat("MAE (RBF - Validation - Best):", mae_rbf_valid_best, "\n")
## MAE (RBF - Validation - Best): 3.267978
cat("R² (RBF - Validation - Best):", r2_rbf_valid_best, "\n")
## R² (RBF - Validation - Best): 0.6204607